Assignment 03

Author
Affiliation

Dakota Alder - dkalder

Boston University

Published

September 22, 2025

Modified

September 25, 2025

1 Loading the Dataset

import pandas as pd
import plotly.express as px
import plotly.io as pio
from pyspark.sql import SparkSession
import re
import numpy as np
import plotly.graph_objects as go
from pyspark.sql.functions import col, split, explode, regexp_replace, transform, when
from pyspark.sql import functions as F
from pyspark.sql.functions import col, monotonically_increasing_id

np.random.seed(42)

pio.renderers.default = "notebook"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("data/lightcast_job_postings.csv")
df.createOrReplaceTempView("job_postings")

# Show Schema and Sample Data
#print("---This is Diagnostic check, No need to print it in the final doc---")

#df.printSchema() # comment this line when rendering the submission
#df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/25 03:04:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 0:>                                                          (0 + 1) / 1]                                                                                [Stage 1:>                                                          (0 + 1) / 1]                                                                                25/09/25 03:05:04 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

2 Data Preparation

[Stage 2:>                                                          (0 + 1) / 1]                                                                                [Stage 3:>                                                          (0 + 1) / 1]                                                                                [Stage 4:>                                                          (0 + 1) / 1]                                                                                
Medians: 87295.0 130042.0 115024.0
[Stage 5:>                                                          (0 + 1) / 1]                                                                                
Data Cleaning Complete. Rows retained: 72498

3 Salary Distribution by Employment Type

[Stage 6:>                                                          (0 + 1) / 1]                                                                                

4 Salary Distribution by Industry

#Question 2 Code

#Select Industry and Salary Columns
pdf = df.select("NAICS2_NAME", "SALARY_FROM").toPandas()


#Create box plot with Horizontal grid lines
fig = px.box(
  pdf,
  x="NAICS2_NAME",
  y="SALARY_FROM",
  title="Salary Distribution by Industry",
  color_discrete_sequence=["blue"],
  boxmode="group",
  points="all",
)



fig.update_layout(
  title=dict(
      text="Salary Distribution by Industry",
      font=dict(size=25, family="Arial", color="black", weight="bold")
  ),
  xaxis=dict(
    title=dict(text="Industry", font=dict(size=24, family="Arial", color="black", weight="bold")),
    tickangle=45,
    tickfont=dict(size=12, family="Arial", color="black",weight="bold"),
    showline=True,
    linewidth=2,
    linecolor="black",
    mirror=True,
    showgrid=False,
    categoryorder="array",
    categoryarray=sorted_employment_types.tolist()
  ),
  yaxis=dict(
    title=dict(text="Salary (K $)", font=dict(size=24, family="Arial", color="black", weight="bold")),
    tickvals=[0,50000,100000,150000,200000,250000,300000,350000,400000,450000,500000],
    ticktext=["0","50K","100K","150K","200K","250K","300K","350K","400K","450K","500K"],
    tickfont=dict(size=18, family="Arial", color="black",weight="bold"),
    showline=True,
    linewidth=2,
    linecolor="black",
    mirror=True,
    showgrid=False,
    gridcolor="lightgray",
    gridwidth=.5
  ),
  font=dict(family="Arial", size=16, color="black"),
  boxgap=0.7,
  plot_bgcolor="white",
  paper_bgcolor="white",
  showlegend=False,
  height=800,
  width=1000,
)
fig.show()
fig.write_html("output/Q2.html")
#fig.write_image("output/Q2.svg", width=850, height=500, scale=1)
[Stage 7:>                                                          (0 + 1) / 1]                                                                                

5 Question 2: Salary Analysis by ONET Occupation Type

#Query the required data with Spark SQL

onet_type = spark.sql("""
  SELECT LOT_OCCUPATION_NAME AS Occupation_Name, PERCENTILE(SALARY, 0.5) As Median_Salary, COUNT(*) As Job_Postings
  FROM job_postings
  GROUP BY LOT_OCCUPATION_NAME
  ORDER BY Job_Postings DESC
  LIMIT 10
  """)

onet_pd = onet_type.toPandas()

fig = px.scatter(
  onet_pd,
  x="Occupation_Name",
  y="Median_Salary",
  size="Job_Postings",
  title="Salary Analysis by Occupation Name (Bubble Chart)",
  labels={
    "Occupation_Name": "Occupation Name",
    "Median_Salary": "Median Salary",
    "Job_Postings": "Number of Job Postings"
  },
  hover_name="Occupation_Name",
  size_max=60,
  width=1000,
  height=600,
  color="Job_Postings",
  color_continuous_scale="Plasma"
)

fig.update_layout(
  font_family="Arial",
  font_size=12,
  title_font_size=24,
  xaxis_title="Occupation Name",
  yaxis_title="Median Salary",
  plot_bgcolor="white",
  xaxis=dict(
    tickangle=-45,
    showline=True,
    linecolor="black"
  ),
  yaxis=dict(
    showline=True,
    linecolor="black"
  )
)
fig.show()
fig.write_html("output/Q3.html")
[Stage 8:>                                                          (0 + 1) / 1]                                                                                

6 Salary by Education Level

7 Salary by Remote Work Type

8 Salary Histograms for Remote Type Work